DATA PREPROCESSING
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
#Import the dataset.
df=pd.read_csv("Employee-atrition.csv")
In [6]:
df.head()
Out[6]:
|
Age |
Attrition |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
41 |
Yes |
Travel_Rarely |
1102 |
Sales |
1 |
2 |
Life Sciences |
1 |
1 |
... |
1 |
80 |
0 |
8 |
0 |
1 |
6 |
4 |
0 |
5 |
|
1 |
49 |
No |
Travel_Frequently |
279 |
Research & Development |
8 |
1 |
Life Sciences |
1 |
2 |
... |
4 |
80 |
1 |
10 |
3 |
3 |
10 |
7 |
1 |
7 |
|
2 |
37 |
Yes |
Travel_Rarely |
1373 |
Research & Development |
2 |
2 |
Other |
1 |
4 |
... |
2 |
80 |
0 |
7 |
3 |
3 |
0 |
0 |
0 |
0 |
|
3 |
33 |
No |
Travel_Frequently |
1392 |
Research & Development |
3 |
4 |
Life Sciences |
1 |
5 |
... |
3 |
80 |
0 |
8 |
3 |
3 |
8 |
7 |
3 |
0 |
|
4 |
27 |
No |
Travel_Rarely |
591 |
Research & Development |
2 |
1 |
Medical |
1 |
7 |
... |
4 |
80 |
1 |
6 |
3 |
3 |
2 |
2 |
2 |
2 |
5 rows × 35 columns
In [7]:
df.shape
Out[7]:
(1470, 35)In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9)
memory usage: 402.1+ KBIn [9]:
df.describe()
Out[9]:
|
Age |
DailyRate |
DistanceFromHome |
Education |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
HourlyRate |
JobInvolvement |
JobLevel |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
count |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.0 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
... |
1470.000000 |
1470.0 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
|
mean |
36.923810 |
802.485714 |
9.192517 |
2.912925 |
1.0 |
1024.865306 |
2.721769 |
65.891156 |
2.729932 |
2.063946 |
... |
2.712245 |
80.0 |
0.793878 |
11.279592 |
2.799320 |
2.761224 |
7.008163 |
4.229252 |
2.187755 |
4.123129 |
|
std |
9.135373 |
403.509100 |
8.106864 |
1.024165 |
0.0 |
602.024335 |
1.093082 |
20.329428 |
0.711561 |
1.106940 |
... |
1.081209 |
0.0 |
0.852077 |
7.780782 |
1.289271 |
0.706476 |
6.126525 |
3.623137 |
3.222430 |
3.568136 |
|
min |
18.000000 |
102.000000 |
1.000000 |
1.000000 |
1.0 |
1.000000 |
1.000000 |
30.000000 |
1.000000 |
1.000000 |
... |
1.000000 |
80.0 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
|
25% |
30.000000 |
465.000000 |
2.000000 |
2.000000 |
1.0 |
491.250000 |
2.000000 |
48.000000 |
2.000000 |
1.000000 |
... |
2.000000 |
80.0 |
0.000000 |
6.000000 |
2.000000 |
2.000000 |
3.000000 |
2.000000 |
0.000000 |
2.000000 |
|
50% |
36.000000 |
802.000000 |
7.000000 |
3.000000 |
1.0 |
1020.500000 |
3.000000 |
66.000000 |
3.000000 |
2.000000 |
... |
3.000000 |
80.0 |
1.000000 |
10.000000 |
3.000000 |
3.000000 |
5.000000 |
3.000000 |
1.000000 |
3.000000 |
|
75% |
43.000000 |
1157.000000 |
14.000000 |
4.000000 |
1.0 |
1555.750000 |
4.000000 |
83.750000 |
3.000000 |
3.000000 |
... |
4.000000 |
80.0 |
1.000000 |
15.000000 |
3.000000 |
3.000000 |
9.000000 |
7.000000 |
3.000000 |
7.000000 |
|
max |
60.000000 |
1499.000000 |
29.000000 |
5.000000 |
1.0 |
2068.000000 |
4.000000 |
100.000000 |
4.000000 |
5.000000 |
... |
4.000000 |
80.0 |
3.000000 |
40.000000 |
6.000000 |
4.000000 |
40.000000 |
18.000000 |
15.000000 |
17.000000 |
8 rows × 26 columns
In [10]:
df.describe()
Out[10]:
|
Age |
DailyRate |
DistanceFromHome |
Education |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
HourlyRate |
JobInvolvement |
JobLevel |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
count |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.0 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
... |
1470.000000 |
1470.0 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
1470.000000 |
|
mean |
36.923810 |
802.485714 |
9.192517 |
2.912925 |
1.0 |
1024.865306 |
2.721769 |
65.891156 |
2.729932 |
2.063946 |
... |
2.712245 |
80.0 |
0.793878 |
11.279592 |
2.799320 |
2.761224 |
7.008163 |
4.229252 |
2.187755 |
4.123129 |
|
std |
9.135373 |
403.509100 |
8.106864 |
1.024165 |
0.0 |
602.024335 |
1.093082 |
20.329428 |
0.711561 |
1.106940 |
... |
1.081209 |
0.0 |
0.852077 |
7.780782 |
1.289271 |
0.706476 |
6.126525 |
3.623137 |
3.222430 |
3.568136 |
|
min |
18.000000 |
102.000000 |
1.000000 |
1.000000 |
1.0 |
1.000000 |
1.000000 |
30.000000 |
1.000000 |
1.000000 |
... |
1.000000 |
80.0 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
|
25% |
30.000000 |
465.000000 |
2.000000 |
2.000000 |
1.0 |
491.250000 |
2.000000 |
48.000000 |
2.000000 |
1.000000 |
... |
2.000000 |
80.0 |
0.000000 |
6.000000 |
2.000000 |
2.000000 |
3.000000 |
2.000000 |
0.000000 |
2.000000 |
|
50% |
36.000000 |
802.000000 |
7.000000 |
3.000000 |
1.0 |
1020.500000 |
3.000000 |
66.000000 |
3.000000 |
2.000000 |
... |
3.000000 |
80.0 |
1.000000 |
10.000000 |
3.000000 |
3.000000 |
5.000000 |
3.000000 |
1.000000 |
3.000000 |
|
75% |
43.000000 |
1157.000000 |
14.000000 |
4.000000 |
1.0 |
1555.750000 |
4.000000 |
83.750000 |
3.000000 |
3.000000 |
... |
4.000000 |
80.0 |
1.000000 |
15.000000 |
3.000000 |
3.000000 |
9.000000 |
7.000000 |
3.000000 |
7.000000 |
|
max |
60.000000 |
1499.000000 |
29.000000 |
5.000000 |
1.0 |
2068.000000 |
4.000000 |
100.000000 |
4.000000 |
5.000000 |
... |
4.000000 |
80.0 |
3.000000 |
40.000000 |
6.000000 |
4.000000 |
40.000000 |
18.000000 |
15.000000 |
17.000000 |
8 rows × 26 columns
In [11]:
df.Attrition.value_counts()
Out[11]:
No 1233Yes 237Name: Attrition, dtype: int64In [ ]:
#Check for null values
In [12]:
df.isnull().sum()
Out[12]:
Age 0Attrition 0BusinessTravel 0
DailyRate 0
Department 0DistanceFromHome 0
Education 0EducationField 0
EmployeeCount 0
EmployeeNumber 0
EnvironmentSatisfaction 0
Gender 0HourlyRate 0
JobInvolvement 0
JobLevel 0
JobRole 0
JobSatisfaction 0
MaritalStatus 0
MonthlyIncome 0
MonthlyRate 0
NumCompaniesWorked 0
Over18 0OverTime 0
PercentSalaryHike 0
PerformanceRating 0
RelationshipSatisfaction 0
StandardHours 0
StockOptionLevel 0
TotalWorkingYears 0
TrainingTimesLastYear 0
WorkLifeBalance 0
YearsAtCompany 0
YearsInCurrentRole 0
YearsSinceLastPromotion 0
YearsWithCurrManager 0
dtype: int64
In [13]:
sns.displot(df['Age'])
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x15c45bb36d0>
In [14]:
df.corr()
Out[14]:
|
Age |
DailyRate |
DistanceFromHome |
Education |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
HourlyRate |
JobInvolvement |
JobLevel |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
Age |
1.000000 |
0.010661 |
-0.001686 |
0.208034 |
NaN |
-0.010145 |
0.010146 |
0.024287 |
0.029820 |
0.509604 |
... |
0.053535 |
NaN |
0.037510 |
0.680381 |
-0.019621 |
-0.021490 |
0.311309 |
0.212901 |
0.216513 |
0.202089 |
|
DailyRate |
0.010661 |
1.000000 |
-0.004985 |
-0.016806 |
NaN |
-0.050990 |
0.018355 |
0.023381 |
0.046135 |
0.002966 |
... |
0.007846 |
NaN |
0.042143 |
0.014515 |
0.002453 |
-0.037848 |
-0.034055 |
0.009932 |
-0.033229 |
-0.026363 |
|
DistanceFromHome |
-0.001686 |
-0.004985 |
1.000000 |
0.021042 |
NaN |
0.032916 |
-0.016075 |
0.031131 |
0.008783 |
0.005303 |
... |
0.006557 |
NaN |
0.044872 |
0.004628 |
-0.036942 |
-0.026556 |
0.009508 |
0.018845 |
0.010029 |
0.014406 |
|
Education |
0.208034 |
-0.016806 |
0.021042 |
1.000000 |
NaN |
0.042070 |
-0.027128 |
0.016775 |
0.042438 |
0.101589 |
... |
-0.009118 |
NaN |
0.018422 |
0.148280 |
-0.025100 |
0.009819 |
0.069114 |
0.060236 |
0.054254 |
0.069065 |
|
EmployeeCount |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
|
EmployeeNumber |
-0.010145 |
-0.050990 |
0.032916 |
0.042070 |
NaN |
1.000000 |
0.017621 |
0.035179 |
-0.006888 |
-0.018519 |
... |
-0.069861 |
NaN |
0.062227 |
-0.014365 |
0.023603 |
0.010309 |
-0.011240 |
-0.008416 |
-0.009019 |
-0.009197 |
|
EnvironmentSatisfaction |
0.010146 |
0.018355 |
-0.016075 |
-0.027128 |
NaN |
0.017621 |
1.000000 |
-0.049857 |
-0.008278 |
0.001212 |
... |
0.007665 |
NaN |
0.003432 |
-0.002693 |
-0.019359 |
0.027627 |
0.001458 |
0.018007 |
0.016194 |
-0.004999 |
|
HourlyRate |
0.024287 |
0.023381 |
0.031131 |
0.016775 |
NaN |
0.035179 |
-0.049857 |
1.000000 |
0.042861 |
-0.027853 |
... |
0.001330 |
NaN |
0.050263 |
-0.002334 |
-0.008548 |
-0.004607 |
-0.019582 |
-0.024106 |
-0.026716 |
-0.020123 |
|
JobInvolvement |
0.029820 |
0.046135 |
0.008783 |
0.042438 |
NaN |
-0.006888 |
-0.008278 |
0.042861 |
1.000000 |
-0.012630 |
... |
0.034297 |
NaN |
0.021523 |
-0.005533 |
-0.015338 |
-0.014617 |
-0.021355 |
0.008717 |
-0.024184 |
0.025976 |
|
JobLevel |
0.509604 |
0.002966 |
0.005303 |
0.101589 |
NaN |
-0.018519 |
0.001212 |
-0.027853 |
-0.012630 |
1.000000 |
... |
0.021642 |
NaN |
0.013984 |
0.782208 |
-0.018191 |
0.037818 |
0.534739 |
0.389447 |
0.353885 |
0.375281 |
|
JobSatisfaction |
-0.004892 |
0.030571 |
-0.003669 |
-0.011296 |
NaN |
-0.046247 |
-0.006784 |
-0.071335 |
-0.021476 |
-0.001944 |
... |
-0.012454 |
NaN |
0.010690 |
-0.020185 |
-0.005779 |
-0.019459 |
-0.003803 |
-0.002305 |
-0.018214 |
-0.027656 |
|
MonthlyIncome |
0.497855 |
0.007707 |
-0.017014 |
0.094961 |
NaN |
-0.014829 |
-0.006259 |
-0.015794 |
-0.015271 |
0.950300 |
... |
0.025873 |
NaN |
0.005408 |
0.772893 |
-0.021736 |
0.030683 |
0.514285 |
0.363818 |
0.344978 |
0.344079 |
|
MonthlyRate |
0.028051 |
-0.032182 |
0.027473 |
-0.026084 |
NaN |
0.012648 |
0.037600 |
-0.015297 |
-0.016322 |
0.039563 |
... |
-0.004085 |
NaN |
-0.034323 |
0.026442 |
0.001467 |
0.007963 |
-0.023655 |
-0.012815 |
0.001567 |
-0.036746 |
|
NumCompaniesWorked |
0.299635 |
0.038153 |
-0.029251 |
0.126317 |
NaN |
-0.001251 |
0.012594 |
0.022157 |
0.015012 |
0.142501 |
... |
0.052733 |
NaN |
0.030075 |
0.237639 |
-0.066054 |
-0.008366 |
-0.118421 |
-0.090754 |
-0.036814 |
-0.110319 |
|
PercentSalaryHike |
0.003634 |
0.022704 |
0.040235 |
-0.011111 |
NaN |
-0.012944 |
-0.031701 |
-0.009062 |
-0.017205 |
-0.034730 |
... |
-0.040490 |
NaN |
0.007528 |
-0.020608 |
-0.005221 |
-0.003280 |
-0.035991 |
-0.001520 |
-0.022154 |
-0.011985 |
|
PerformanceRating |
0.001904 |
0.000473 |
0.027110 |
-0.024539 |
NaN |
-0.020359 |
-0.029548 |
-0.002172 |
-0.029071 |
-0.021222 |
... |
-0.031351 |
NaN |
0.003506 |
0.006744 |
-0.015579 |
0.002572 |
0.003435 |
0.034986 |
0.017896 |
0.022827 |
|
RelationshipSatisfaction |
0.053535 |
0.007846 |
0.006557 |
-0.009118 |
NaN |
-0.069861 |
0.007665 |
0.001330 |
0.034297 |
0.021642 |
... |
1.000000 |
NaN |
-0.045952 |
0.024054 |
0.002497 |
0.019604 |
0.019367 |
-0.015123 |
0.033493 |
-0.000867 |
|
StandardHours |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
|
StockOptionLevel |
0.037510 |
0.042143 |
0.044872 |
0.018422 |
NaN |
0.062227 |
0.003432 |
0.050263 |
0.021523 |
0.013984 |
... |
-0.045952 |
NaN |
1.000000 |
0.010136 |
0.011274 |
0.004129 |
0.015058 |
0.050818 |
0.014352 |
0.024698 |
|
TotalWorkingYears |
0.680381 |
0.014515 |
0.004628 |
0.148280 |
NaN |
-0.014365 |
-0.002693 |
-0.002334 |
-0.005533 |
0.782208 |
... |
0.024054 |
NaN |
0.010136 |
1.000000 |
-0.035662 |
0.001008 |
0.628133 |
0.460365 |
0.404858 |
0.459188 |
|
TrainingTimesLastYear |
-0.019621 |
0.002453 |
-0.036942 |
-0.025100 |
NaN |
0.023603 |
-0.019359 |
-0.008548 |
-0.015338 |
-0.018191 |
... |
0.002497 |
NaN |
0.011274 |
-0.035662 |
1.000000 |
0.028072 |
0.003569 |
-0.005738 |
-0.002067 |
-0.004096 |
|
WorkLifeBalance |
-0.021490 |
-0.037848 |
-0.026556 |
0.009819 |
NaN |
0.010309 |
0.027627 |
-0.004607 |
-0.014617 |
0.037818 |
... |
0.019604 |
NaN |
0.004129 |
0.001008 |
0.028072 |
1.000000 |
0.012089 |
0.049856 |
0.008941 |
0.002759 |
|
YearsAtCompany |
0.311309 |
-0.034055 |
0.009508 |
0.069114 |
NaN |
-0.011240 |
0.001458 |
-0.019582 |
-0.021355 |
0.534739 |
... |
0.019367 |
NaN |
0.015058 |
0.628133 |
0.003569 |
0.012089 |
1.000000 |
0.758754 |
0.618409 |
0.769212 |
|
YearsInCurrentRole |
0.212901 |
0.009932 |
0.018845 |
0.060236 |
NaN |
-0.008416 |
0.018007 |
-0.024106 |
0.008717 |
0.389447 |
... |
-0.015123 |
NaN |
0.050818 |
0.460365 |
-0.005738 |
0.049856 |
0.758754 |
1.000000 |
0.548056 |
0.714365 |
|
YearsSinceLastPromotion |
0.216513 |
-0.033229 |
0.010029 |
0.054254 |
NaN |
-0.009019 |
0.016194 |
-0.026716 |
-0.024184 |
0.353885 |
... |
0.033493 |
NaN |
0.014352 |
0.404858 |
-0.002067 |
0.008941 |
0.618409 |
0.548056 |
1.000000 |
0.510224 |
|
YearsWithCurrManager |
0.202089 |
-0.026363 |
0.014406 |
0.069065 |
NaN |
-0.009197 |
-0.004999 |
-0.020123 |
0.025976 |
0.375281 |
... |
-0.000867 |
NaN |
0.024698 |
0.459188 |
-0.004096 |
0.002759 |
0.769212 |
0.714365 |
0.510224 |
1.000000 |
26 rows × 26 columns
In [15]:
sns.boxplot(df.Age)
C:\Users\mahes\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(Out[15]:
<AxesSubplot:xlabel='Age'>
In [16]:
df.head()
Out[16]:
|
Age |
Attrition |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
41 |
Yes |
Travel_Rarely |
1102 |
Sales |
1 |
2 |
Life Sciences |
1 |
1 |
... |
1 |
80 |
0 |
8 |
0 |
1 |
6 |
4 |
0 |
5 |
|
1 |
49 |
No |
Travel_Frequently |
279 |
Research & Development |
8 |
1 |
Life Sciences |
1 |
2 |
... |
4 |
80 |
1 |
10 |
3 |
3 |
10 |
7 |
1 |
7 |
|
2 |
37 |
Yes |
Travel_Rarely |
1373 |
Research & Development |
2 |
2 |
Other |
1 |
4 |
... |
2 |
80 |
0 |
7 |
3 |
3 |
0 |
0 |
0 |
0 |
|
3 |
33 |
No |
Travel_Frequently |
1392 |
Research & Development |
3 |
4 |
Life Sciences |
1 |
5 |
... |
3 |
80 |
0 |
8 |
3 |
3 |
8 |
7 |
3 |
0 |
|
4 |
27 |
No |
Travel_Rarely |
591 |
Research & Development |
2 |
1 |
Medical |
1 |
7 |
... |
4 |
80 |
1 |
6 |
3 |
3 |
2 |
2 |
2 |
2 |
5 rows × 35 columns
In [17]:
df
Out[17]:
|
Age |
Attrition |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
41 |
Yes |
Travel_Rarely |
1102 |
Sales |
1 |
2 |
Life Sciences |
1 |
1 |
... |
1 |
80 |
0 |
8 |
0 |
1 |
6 |
4 |
0 |
5 |
|
1 |
49 |
No |
Travel_Frequently |
279 |
Research & Development |
8 |
1 |
Life Sciences |
1 |
2 |
... |
4 |
80 |
1 |
10 |
3 |
3 |
10 |
7 |
1 |
7 |
|
2 |
37 |
Yes |
Travel_Rarely |
1373 |
Research & Development |
2 |
2 |
Other |
1 |
4 |
... |
2 |
80 |
0 |
7 |
3 |
3 |
0 |
0 |
0 |
0 |
|
3 |
33 |
No |
Travel_Frequently |
1392 |
Research & Development |
3 |
4 |
Life Sciences |
1 |
5 |
... |
3 |
80 |
0 |
8 |
3 |
3 |
8 |
7 |
3 |
0 |
|
4 |
27 |
No |
Travel_Rarely |
591 |
Research & Development |
2 |
1 |
Medical |
1 |
7 |
... |
4 |
80 |
1 |
6 |
3 |
3 |
2 |
2 |
2 |
2 |
|
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
|
1465 |
36 |
No |
Travel_Frequently |
884 |
Research & Development |
23 |
2 |
Medical |
1 |
2061 |
... |
3 |
80 |
1 |
17 |
3 |
3 |
5 |
2 |
0 |
3 |
|
1466 |
39 |
No |
Travel_Rarely |
613 |
Research & Development |
6 |
1 |
Medical |
1 |
2062 |
... |
1 |
80 |
1 |
9 |
5 |
3 |
7 |
7 |
1 |
7 |
|
1467 |
27 |
No |
Travel_Rarely |
155 |
Research & Development |
4 |
3 |
Life Sciences |
1 |
2064 |
... |
2 |
80 |
1 |
6 |
0 |
3 |
6 |
2 |
0 |
3 |
|
1468 |
49 |
No |
Travel_Frequently |
1023 |
Sales |
2 |
3 |
Medical |
1 |
2065 |
... |
4 |
80 |
0 |
17 |
3 |
2 |
9 |
6 |
0 |
8 |
|
1469 |
34 |
No |
Travel_Rarely |
628 |
Research & Development |
8 |
3 |
Medical |
1 |
2068 |
... |
1 |
80 |
0 |
6 |
3 |
4 |
4 |
3 |
1 |
2 |
1470 rows × 35 columns
In [ ]:
#Splitting Dependent and Independent variables
#Dependant - Attrition
#Independant - All others
In [20]:
x = df.drop(columns= ['Attrition'], axis=1)
x.head()
Out[20]:
|
Age |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
41 |
Travel_Rarely |
1102 |
Sales |
1 |
2 |
Life Sciences |
1 |
1 |
2 |
... |
1 |
80 |
0 |
8 |
0 |
1 |
6 |
4 |
0 |
5 |
|
1 |
49 |
Travel_Frequently |
279 |
Research & Development |
8 |
1 |
Life Sciences |
1 |
2 |
3 |
... |
4 |
80 |
1 |
10 |
3 |
3 |
10 |
7 |
1 |
7 |
|
2 |
37 |
Travel_Rarely |
1373 |
Research & Development |
2 |
2 |
Other |
1 |
4 |
4 |
... |
2 |
80 |
0 |
7 |
3 |
3 |
0 |
0 |
0 |
0 |
|
3 |
33 |
Travel_Frequently |
1392 |
Research & Development |
3 |
4 |
Life Sciences |
1 |
5 |
4 |
... |
3 |
80 |
0 |
8 |
3 |
3 |
8 |
7 |
3 |
0 |
|
4 |
27 |
Travel_Rarely |
591 |
Research & Development |
2 |
1 |
Medical |
1 |
7 |
1 |
... |
4 |
80 |
1 |
6 |
3 |
3 |
2 |
2 |
2 |
2 |
5 rows × 34 columns
In [21]:
y = df.Attrition
y.head
Out[21]:
<bound method NDFrame.head of 0 Yes1 No2 Yes3 No4 No ... 1465 No1466 No1467 No1468 No1469 NoName: Attrition, Length: 1470, dtype: object>In [ ]:
#Label Encoding
In [22]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
x.BusinessTravel=le.fit_transform(x.BusinessTravel)
x.Department=le.fit_transform(x.Department)
x.EducationField=le.fit_transform(x.EducationField)
x.Gender=le.fit_transform(x.Gender)
x.JobRole=le.fit_transform(x.JobRole)
x.MaritalStatus=le.fit_transform(x.MaritalStatus)
x.Over18=le.fit_transform(x.Over18)
x.OverTime=le.fit_transform(x.OverTime)
x.head()
Out[22]:
|
Age |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
41 |
2 |
1102 |
2 |
1 |
2 |
1 |
1 |
1 |
2 |
... |
1 |
80 |
0 |
8 |
0 |
1 |
6 |
4 |
0 |
5 |
|
1 |
49 |
1 |
279 |
1 |
8 |
1 |
1 |
1 |
2 |
3 |
... |
4 |
80 |
1 |
10 |
3 |
3 |
10 |
7 |
1 |
7 |
|
2 |
37 |
2 |
1373 |
1 |
2 |
2 |
4 |
1 |
4 |
4 |
... |
2 |
80 |
0 |
7 |
3 |
3 |
0 |
0 |
0 |
0 |
|
3 |
33 |
1 |
1392 |
1 |
3 |
4 |
1 |
1 |
5 |
4 |
... |
3 |
80 |
0 |
8 |
3 |
3 |
8 |
7 |
3 |
0 |
|
4 |
27 |
2 |
591 |
1 |
2 |
1 |
3 |
1 |
7 |
1 |
... |
4 |
80 |
1 |
6 |
3 |
3 |
2 |
2 |
2 |
2 |
5 rows × 34 columns
In [ ]:
#Feature Scaling using MINMAX
In [23]:
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
x_Scaled= pd.DataFrame(ms.fit_transform(x),columns=x.columns)
x_Scaled
Out[23]:
|
Age |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
0 |
0.547619 |
1.0 |
0.715820 |
1.0 |
0.000000 |
0.25 |
0.2 |
0.0 |
0.000000 |
0.333333 |
... |
0.000000 |
0.0 |
0.000000 |
0.200 |
0.000000 |
0.000000 |
0.150 |
0.222222 |
0.000000 |
0.294118 |
|
1 |
0.738095 |
0.5 |
0.126700 |
0.5 |
0.250000 |
0.00 |
0.2 |
0.0 |
0.000484 |
0.666667 |
... |
1.000000 |
0.0 |
0.333333 |
0.250 |
0.500000 |
0.666667 |
0.250 |
0.388889 |
0.066667 |
0.411765 |
|
2 |
0.452381 |
1.0 |
0.909807 |
0.5 |
0.035714 |
0.25 |
0.8 |
0.0 |
0.001451 |
1.000000 |
... |
0.333333 |
0.0 |
0.000000 |
0.175 |
0.500000 |
0.666667 |
0.000 |
0.000000 |
0.000000 |
0.000000 |
|
3 |
0.357143 |
0.5 |
0.923407 |
0.5 |
0.071429 |
0.75 |
0.2 |
0.0 |
0.001935 |
1.000000 |
... |
0.666667 |
0.0 |
0.000000 |
0.200 |
0.500000 |
0.666667 |
0.200 |
0.388889 |
0.200000 |
0.000000 |
|
4 |
0.214286 |
1.0 |
0.350036 |
0.5 |
0.035714 |
0.00 |
0.6 |
0.0 |
0.002903 |
0.000000 |
... |
1.000000 |
0.0 |
0.333333 |
0.150 |
0.500000 |
0.666667 |
0.050 |
0.111111 |
0.133333 |
0.117647 |
|
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
|
1465 |
0.428571 |
0.5 |
0.559771 |
0.5 |
0.785714 |
0.25 |
0.6 |
0.0 |
0.996613 |
0.666667 |
... |
0.666667 |
0.0 |
0.333333 |
0.425 |
0.500000 |
0.666667 |
0.125 |
0.111111 |
0.000000 |
0.176471 |
|
1466 |
0.500000 |
1.0 |
0.365784 |
0.5 |
0.178571 |
0.00 |
0.6 |
0.0 |
0.997097 |
1.000000 |
... |
0.000000 |
0.0 |
0.333333 |
0.225 |
0.833333 |
0.666667 |
0.175 |
0.388889 |
0.066667 |
0.411765 |
|
1467 |
0.214286 |
1.0 |
0.037938 |
0.5 |
0.107143 |
0.50 |
0.2 |
0.0 |
0.998065 |
0.333333 |
... |
0.333333 |
0.0 |
0.333333 |
0.150 |
0.000000 |
0.666667 |
0.150 |
0.111111 |
0.000000 |
0.176471 |
|
1468 |
0.738095 |
0.5 |
0.659270 |
1.0 |
0.035714 |
0.50 |
0.6 |
0.0 |
0.998549 |
1.000000 |
... |
1.000000 |
0.0 |
0.000000 |
0.425 |
0.500000 |
0.333333 |
0.225 |
0.333333 |
0.000000 |
0.470588 |
|
1469 |
0.380952 |
1.0 |
0.376521 |
0.5 |
0.250000 |
0.50 |
0.6 |
0.0 |
1.000000 |
0.333333 |
... |
0.000000 |
0.0 |
0.000000 |
0.150 |
0.500000 |
1.000000 |
0.100 |
0.166667 |
0.066667 |
0.117647 |
1470 rows × 34 columns
In [ ]:
#Splitting Data into Train and Test.
In [25]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_Scaled,y,test_size=0.2,random_state=0)
In [26]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape
Out[26]:
((1176, 34), (294, 34), (1176,), (294,))In [28]:
x_train.head()
Out[28]:
|
Age |
BusinessTravel |
DailyRate |
Department |
DistanceFromHome |
Education |
EducationField |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
... |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
|
|
1374 |
0.952381 |
1.0 |
0.360057 |
1.0 |
0.714286 |
0.50 |
0.2 |
0.0 |
0.937107 |
1.000000 |
... |
0.666667 |
0.0 |
0.333333 |
0.725 |
0.333333 |
0.333333 |
0.025 |
0.000000 |
0.000000 |
0.000000 |
|
1092 |
0.642857 |
1.0 |
0.607015 |
0.5 |
0.964286 |
0.50 |
1.0 |
0.0 |
0.747460 |
1.000000 |
... |
1.000000 |
0.0 |
0.333333 |
0.200 |
0.500000 |
0.666667 |
0.125 |
0.222222 |
0.000000 |
0.176471 |
|
768 |
0.523810 |
1.0 |
0.141732 |
1.0 |
0.892857 |
0.50 |
0.4 |
0.0 |
0.515239 |
0.666667 |
... |
0.333333 |
0.0 |
0.333333 |
0.200 |
0.500000 |
0.333333 |
0.175 |
0.388889 |
0.466667 |
0.294118 |
|
569 |
0.428571 |
0.0 |
0.953472 |
1.0 |
0.250000 |
0.75 |
0.2 |
0.0 |
0.381229 |
0.000000 |
... |
0.333333 |
0.0 |
0.000000 |
0.250 |
0.166667 |
0.666667 |
0.250 |
0.388889 |
0.000000 |
0.529412 |
|
911 |
0.166667 |
0.5 |
0.355762 |
1.0 |
0.821429 |
0.00 |
0.2 |
0.0 |
0.615385 |
0.666667 |
... |
1.000000 |
0.0 |
0.000000 |
0.025 |
0.666667 |
0.666667 |
0.025 |
0.000000 |
0.066667 |
0.000000 |
5 rows × 34 columns
In [29]:
from sklearn.linear_model import LogisticRegression
modellr=LogisticRegression()
In [30]:
modellr.fit(x_train,y_train)
Out[30]:
LogisticRegression()
In [31]:
pred=modellr.predict(x_test)
pred
Out[31]:
array(['No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'], dtype=object)In [32]:
y_test
Out[32]:
442 No1091 No981 Yes785 No1332 Yes ... 1439 No481 No124 Yes198 No1229 NoName: Attrition, Length: 294, dtype: objectIn [ ]:
#Evaluation of Classification Model
In [33]:
#Accuracy score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
In [34]:
accuracy_score(y_test,pred)
Out[34]:
0.8843537414965986In [35]:
confusion_matrix(y_test,pred)
Out[35]:
array([[242, 3],
[ 31, 18]], dtype=int64)In [36]:
pd.crosstab(y_test,pred)
Out[36]:
|
col_0 |
No |
Yes |
|
Attrition |
||
|
No |
242 |
3 |
|
Yes |
31 |
18 |
In [ ]:
#Performance Metrics:
In [37]:
print(classification_report(y_test,pred))
precision recall f1-score support No 0.89 0.99 0.93 245 Yes 0.86 0.37 0.51 49 accuracy 0.88 294 macro avg 0.87 0.68 0.72 294weighted avg 0.88 0.88 0.86 294 In [38]:
probability=modellr.predict_proba(x_test)[:,1]
probability
Out[38]:
array([0.16000127, 0.20600667, 0.31532384, 0.09242886, 0.63667551,
0.06153061, 0.61819432, 0.0757087 , 0.00841372, 0.3912069 , 0.05398439, 0.33293123, 0.02020698, 0.67215483, 0.19786547, 0.03454902, 0.11043981, 0.17101703, 0.04477777, 0.22783614, 0.2335018 , 0.01553905, 0.06464492, 0.05029956, 0.58792413, 0.44849464, 0.07412714, 0.04460935, 0.67666632, 0.0584383 , 0.01599026, 0.03521098, 0.06963085, 0.17397462, 0.07830857, 0.04288032, 0.08150424, 0.07106342, 0.03622137, 0.05223965, 0.04862098, 0.02091497, 0.01819361, 0.01362467, 0.02873997, 0.50236969, 0.41553218, 0.00306874, 0.73976412, 0.51382382, 0.09637213, 0.48845516, 0.08036228, 0.25757243, 0.66516772, 0.26308027, 0.01964858, 0.30198497, 0.02919946, 0.16038964, 0.02102747, 0.21670232, 0.13981568, 0.0358316 , 0.37208403, 0.03002317, 0.29091186, 0.16041142, 0.10437497, 0.08695177, 0.08217589, 0.30984518, 0.08531362, 0.07420689, 0.12268651, 0.06192552, 0.04640904, 0.07624712, 0.19738483, 0.03236316, 0.00884439, 0.0244108 , 0.13635803, 0.0260104 , 0.03341008, 0.08186888, 0.00499397, 0.03474852, 0.03858027, 0.14602694, 0.26167665, 0.16667357, 0.27400109, 0.24159565, 0.02160421, 0.17748606, 0.34076078, 0.28022482, 0.06914126, 0.05003806, 0.24437761, 0.74698271, 0.35438567, 0.01920627, 0.08778845, 0.03255847, 0.05461351, 0.15123251, 0.06843702, 0.13752637, 0.09584388, 0.04669882, 0.02493091, 0.15383171, 0.07081259, 0.03089296, 0.0537667 , 0.11554316, 0.00881616, 0.01263271, 0.17552253, 0.05045234, 0.08823238, 0.82995757, 0.03017756, 0.0236819 , 0.0087012 , 0.1349589 , 0.16474801, 0.05202613, 0.01524549, 0.29278083, 0.54767448, 0.34275448, 0.04629541, 0.38966344, 0.61333366, 0.14552367, 0.07402366, 0.24143471, 0.09418418, 0.0689069 , 0.10061956, 0.19346327, 0.20026293, 0.03004939, 0.14900424, 0.00348846, 0.11225149, 0.15843155, 0.06047573, 0.18601882, 0.06085869, 0.12221317, 0.03280184, 0.02738799, 0.06356425, 0.08302382, 0.01541716, 0.014665 , 0.38517822, 0.01264231, 0.14961974, 0.80508787, 0.11598661, 0.2842811 , 0.17020143, 0.1530583 , 0.02764153, 0.00613226, 0.04191632, 0.09782393, 0.11551417, 0.10377982, 0.01779313, 0.14371315, 0.10615435, 0.10298963, 0.05132621, 0.09061081, 0.02897383, 0.09924087, 0.00512032, 0.75108423, 0.04296968, 0.04062134, 0.37518972, 0.04563128, 0.7251816 , 0.10671665, 0.36949086, 0.38146941, 0.32095493, 0.05266802, 0.08172004, 0.13947833, 0.04334317, 0.01469593, 0.26413988, 0.06330966, 0.1614747 , 0.15380517, 0.67152357, 0.05840793, 0.27891823, 0.04512564, 0.46033865, 0.00348431, 0.14068967, 0.02747401, 0.12714133, 0.17284246, 0.07341066, 0.10099827, 0.16870885, 0.02560842, 0.01824031, 0.08670796, 0.02834237, 0.13710215, 0.08778935, 0.2200061 , 0.73401148, 0.15938978, 0.4095449 , 0.01513845, 0.11306309, 0.21497506, 0.32337575, 0.03409266, 0.04256318, 0.32157531, 0.05454465, 0.02348479, 0.16423352, 0.32696147, 0.22892063, 0.00877159, 0.08198819, 0.01156361, 0.1408691 , 0.29235147, 0.01270305, 0.17329916, 0.04081391, 0.04094165, 0.42771425, 0.34958286, 0.03766772, 0.12025286, 0.37698923, 0.3192629 , 0.79559338, 0.05385659, 0.21597037, 0.06383728, 0.00570991, 0.66018187, 0.35855286, 0.37783606, 0.36781398, 0.03554512, 0.21718203, 0.05943622, 0.06554485, 0.10081475, 0.00818713, 0.26591316, 0.42809675, 0.06542835, 0.09296803, 0.01259826, 0.14226651, 0.05072662, 0.02372258, 0.02586923, 0.06760427, 0.24315648, 0.26961432, 0.19831733, 0.2652296 , 0.0165923 , 0.15784236, 0.08398982, 0.02711775, 0.18750547, 0.00783535, 0.2844239 , 0.00270742, 0.02484969, 0.22585745, 0.72775605, 0.07691547, 0.26304359])In [ ]:
In [ ]:
DECISION TREE
In [ ]:
In [39]:
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(x_train,y_train)
Out[39]:
DecisionTreeClassifier()
In [40]:
pred=dtc.predict(x_test)
pred
Out[40]:
array(['No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'], dtype=object)In [41]:
y_test
Out[41]:
442 No1091 No981 Yes785 No1332 Yes ... 1439 No481 No124 Yes198 No1229 NoName: Attrition, Length: 294, dtype: objectIn [ ]:
#Evaluation of Classification Model
In [42]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
accuracy_score(y_test,pred)
Out[42]:
0.7414965986394558In [43]:
confusion_matrix(y_test,pred)
Out[43]:
array([[204, 41],
[ 35, 14]], dtype=int64)In [44]:
pd.crosstab(y_test,pred)
Out[44]:
|
col_0 |
No |
Yes |
|
Attrition |
||
|
No |
204 |
41 |
|
Yes |
35 |
14 |
In [45]:
#Performance Metrics:
print(classification_report(y_test,pred))
precision recall f1-score support No 0.85 0.83 0.84 245 Yes 0.25 0.29 0.27 49 accuracy 0.74 294 macro avg 0.55 0.56 0.56 294weighted avg 0.75 0.74 0.75 294 In [ ]:
In [ ]:
RANDOM FOREST
In [ ]:
In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rfc=RandomForestClassifier()
In [53]:
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(0,14))}]
In [54]:
rfc_cv= GridSearchCV(rfc,param_grid=forest_params,cv=10,scoring="accuracy")
In [58]:
rfc_cv.fit(x_train,y_train)
C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 50 fits failed out of a total of 700.The score on these train-test partitions for these parameters will be set to nan.If these failures are not expected, you can try to debug them by setting error_score='raise'. Below are more details about the failures:--------------------------------------------------------------------------------50 fits failed with the following error:Traceback (most recent call last): File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score estimator.fit(X_train, y_train, **fit_params) File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit trees = Parallel( File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__ if self.dispatch_one_batch(iterator): File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch self._dispatch(tasks) File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch job = self._backend.apply_async(batch, callback=cb) File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async result = ImmediateResult(func) File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__ self.results = batch() File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__ return [func(*args, **kwargs) File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp> return [func(*args, **kwargs) File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__ return self.function(*args, **kwargs) File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 185, in _parallel_build_trees tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit super().fit( File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 308, in fit raise ValueError("max_features must be in (0, n_features]")ValueError: max_features must be in (0, n_features]
warnings.warn(some_fits_failed_message, FitFailedWarning)C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [ nan 0.8460959 0.85203535 0.85969868 0.85627988 0.85712734 0.85796755 0.86224105 0.8596842 0.85797479 0.86136462 0.85796031 0.85881501 0.85543242 nan 0.84610314 0.85630161 0.85459221 0.86054614 0.85967695 0.85883674 0.85883674 0.86221932 0.86393597 0.85799652 0.8605389 0.85795306 0.86309576 nan 0.85290453 0.85459945 0.85458496 0.85373026 0.85969868 0.86222657 0.85798928 0.86223381 0.85713458 0.86563813 0.86222657 0.85796755 0.85797479 nan 0.85289729 0.85544691 0.85970592 0.85883674 0.85969144 0.85883674 0.8596842 0.85712734 0.85711285 0.85969868 0.85880776 0.85797479 0.86051717 nan 0.84948573 0.8571708 0.85883674 0.85457772 0.86137911 0.85798928 0.86650732 0.86308127 0.86053165 0.86051717 0.85796755 0.86052441 0.85711285] warnings.warn(Out[58]:
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [10, 11, 12, 13, 14], 'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}], scoring='accuracy')In [59]:
pred=rfc_cv.predict(x_test)
print(classification_report(y_test,pred))
precision recall f1-score support No 0.85 0.98 0.91 245 Yes 0.67 0.16 0.26 49 accuracy 0.85 294 macro avg 0.76 0.57 0.59 294weighted avg 0.82 0.85 0.81 294 In [60]:
rfc_cv.best_params_
Out[60]:
{'max_depth': 14, 'max_features': 7}In [ ]:
In [ ]:
In [ ]: